Authors: Jose Pérez Cano & Álvaro Ribot Barrado
0. Libraries
# One-time installation of the non-base packages used below. Re-running the
# notebook reinstalls them every time; safe to skip once installed.
install.packages("klaR")
install.packages("TunePareto")
install.packages("rgl")
install.packages("glmnet")
install.packages("ca")
# LDA/ QDA
library(MASS)
# RDA
library(klaR)
# Multinomial
library(nnet)
# Cross-Validation
library(TunePareto)
# Naive Bayes
library(e1071)
# k-NN
library(class)
# Correspondence analysis
library(ca)
# Cross-validation nn
library(caret)
Loading required package: lattice
Loading required package: ggplot2
1. Read data
# Fix the RNG seed so imputation/splits below are reproducible.
set.seed(2105)
# NOTE(review): setwd() inside a notebook chunk only lasts for that chunk
# (see the runtime message below) -- prefer the knitr root.dir option.
setwd("../data")
The working directory was changed to /Users/joseperezcano/Desktop/CFIS segundo curso/AA1/Project/data inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
# Cleveland heart-disease data; the file has no header row, so columns get
# auto-generated names V1, V2, ...
clev <- read.csv("cleveland.csv", header=F)
head(clev)
2. Preprocess data
The dataset has missing values that need to be treated: columns with too many should be removed, and columns with only a few should have them imputed.
# Project helpers for missing-data handling and column cleanup.
source("Preprocessing.R")
# Missings
# Keep only columns passing the missingness threshold (60 -- presumably a
# percentage; confirm against much.na.cols in Preprocessing.R), drop
# identifier/constant columns, and impute remaining NAs with 7-NN.
clev <- clev[,much.na.cols(clev,60)]
dummy <- c("V1", "V2", "V36", "V69", "V70", "V71", "V72", "V73", "V28", "location")
clev <- remove.var(clev, dummy)
clev <- knn.imputation(clev, 7)
# Multicollinearity
# The commented lines were used to find pairs with |cor| > 0.9; V57 and V55
# turned out redundant and are removed below.
#corr.factors <- cor(clev)
#which(abs(corr.factors)-diag(diag(corr.factors))>0.9, arr.ind=T)
clev <- remove.var(clev, c("V57", "V55"))
# Factors
# Convert the categorical columns (including the target V58) to factors.
factores <- c("V58", "V4", "V9", "V16", "V18", "V19", "V20", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V38", "V39", "V41", "V51", "V56", "V11", "V59", "V60", "V61", "V63", "V65", "V67", "V68")
for (f in factores){
clev[,f] <- as.factor(clev[,f])
}
# Merge the rare level 2 of V25 into level 1 -- presumably move.value(df,
# var, from, to); TODO confirm argument order in Preprocessing.R.
clev <- move.value(clev, "V25", 2, 1)
summary(clev)
V3 V4 V9 V10 V11 V12
Min. :29.00 0: 91 1: 22 Min. : 94.0 0:108 Min. :126.0
1st Qu.:48.00 1:191 2: 43 1st Qu.:120.0 1:174 1st Qu.:213.0
Median :55.00 3: 84 Median :130.0 Median :244.0
Mean :54.41 4:133 Mean :131.6 Mean :249.1
3rd Qu.:61.00 3rd Qu.:140.0 3rd Qu.:277.0
Max. :77.00 Max. :200.0 Max. :564.0
V14 V15 V16 V18 V19 V20
Min. : 0.00 Min. : 0.00 0:240 0:107 0:138 1 :38
1st Qu.: 0.00 1st Qu.: 0.00 1: 42 1:175 1: 2 12 :33
Median :10.00 Median :15.00 2:142 2 :31
Mean :16.64 Mean :15.08 8 :30
3rd Qu.:30.00 3rd Qu.:30.00 6 :26
Max. :99.00 Max. :54.00 11 :25
(Other):99
V21 V22 V23 V24 V25 V26 V27 V29
4 : 13 81:67 0:271 0:186 0:211 0:252 0:248 Min. : 1.800
15 : 13 82:96 1: 11 1: 95 1: 71 1: 30 1: 34 1st Qu.: 6.500
20 : 13 83:86 2: 1 Median : 8.500
13 : 12 84:33 Mean : 8.418
16 : 12 3rd Qu.:10.075
21 : 12 Max. :15.000
(Other):207
V31 V32 V33 V34
Min. : 3.000 Min. : 71.0 Min. : 40.00 Min. : 84.0
1st Qu.: 7.000 1st Qu.:133.2 1st Qu.: 65.00 1st Qu.:154.0
Median : 9.000 Median :153.5 Median : 74.00 Median :168.0
Mean : 9.754 Mean :149.8 Mean : 75.12 Mean :168.1
3rd Qu.:12.000 3rd Qu.:165.8 3rd Qu.: 84.00 3rd Qu.:184.0
Max. :18.000 Max. :202.0 Max. :119.00 Max. :232.0
V35 V37 V38 V39 V40 V41
Min. : 26.00 Min. : 50.00 0:190 0:276 Min. :0.000 1:135
1st Qu.: 70.00 1st Qu.: 80.00 1: 92 1: 6 1st Qu.:0.000 2:129
Median : 80.00 Median : 85.00 Median :0.800 3: 18
Mean : 78.74 Mean : 84.95 Mean :1.027
3rd Qu.: 85.00 3rd Qu.: 90.00 3rd Qu.:1.600
Max. :120.00 Max. :110.00 Max. :6.200
V43 V44 V51 V56 V58 V59 V60
Min. : 24.0 Min. :0.0000 1: 2 5 : 14 0:157 1:270 1:242
1st Qu.: 92.0 1st Qu.:0.0000 3:159 21 : 14 1: 50 2: 12 2: 40
Median :118.0 Median :0.0000 6: 14 14 : 12 2: 31
Mean :123.6 Mean :0.6702 7:107 1 : 11 3: 32
3rd Qu.:152.8 3rd Qu.:1.0000 17 : 11 4: 12
Max. :270.0 Max. :3.0000 30 : 11
(Other):209
V61 V63 V65 V67 V68
1:224 1:238 1:236 1:233 1:246
2: 58 2: 44 2: 46 2: 49 2: 36
2.1 Visualizations
# Project plotting helpers (histograms, boxplots, correlation report).
source("Visualizations.R")
histograms(clev)



boxplot.num(clev)



# Second argument F: presumably switches to the categorical variables --
# TODO confirm against Visualizations.R.
histograms(clev, F)



# Report strongly correlated variable pairs (row/col indices printed below).
show.cor(clev)
row col
V34 10 2
V37 12 2
V15 5 4
V14 4 5
V31 7 6
V29 6 7
V32 8 7
V31 7 8
V10 2 10
V37 12 11
V10 2 12
V35 11 12
row col
2.2 Modification of values



Box-Cox transformation



Variables with many zeros:

This is the final dataset.
summary(clev)
V3.V1 V4 V9 V10.V1 V11
Min. :-2.8069276 0: 91 1: 22 Min. :-3.901751 0:108
1st Qu.:-0.7081950 1:191 2: 43 1st Qu.:-0.551231 1:174
Median : 0.0650223 3: 84 Median : 0.040911
Mean : 0.0000000 4:133 Mean : 0.000000
3rd Qu.: 0.7277800 3rd Qu.: 0.555074
Max. : 2.4951338 Max. : 2.864084
V12.V1 V14.V1 V15.V1 V16 V18
Min. :-4.643087 Min. :-1.0513034 Min. :-0.9838265 0:240 0:107
1st Qu.:-0.650497 1st Qu.:-1.0513034 1st Qu.:-0.9838265 1: 42 1:175
Median : 0.006803 Median : 0.0726030 Median :-0.0053205
Mean : 0.000000 Mean : 0.0000000 Mean : 0.0000000
3rd Qu.: 0.617022 3rd Qu.: 0.8953597 3rd Qu.: 0.9731855
Max. : 4.315880 Max. : 2.4849857 Max. : 2.5387951
V19 V20 V21 V22 V23 V24 V25 V26
0:138 1 :38 4 : 13 81:67 0:271 0:186 0:211 0:252
1: 2 12 :33 15 : 13 82:96 1: 11 1: 95 1: 71 1: 30
2:142 2 :31 20 : 13 83:86 2: 1
8 :30 13 : 12 84:33
6 :26 16 : 12
11 :25 21 : 12
(Other):99 (Other):207
V27 V29.V1 V31.V1 V32.V1
0:248 Min. :-2.5548211 Min. :-4.123390 Min. :-3.435980
1: 34 1st Qu.:-0.7405477 1st Qu.:-0.858782 1st Qu.:-0.720470
Median : 0.0314835 Median :-0.124711 Median : 0.162889
Mean : 0.0000000 Mean : 0.000000 Mean : 0.000000
3rd Qu.: 0.6394581 3rd Qu.: 0.774339 3rd Qu.: 0.697266
Max. : 2.5405850 Max. : 2.200026 Max. : 2.278588
V33.V1 V34.V1 V35.V1 V37.V1
Min. :-2.545485 Min. :-3.565358 Min. :-3.956482 Min. :-3.687203
1st Qu.:-0.733340 1st Qu.:-0.596232 1st Qu.:-0.655956 1st Qu.:-0.521933
Median :-0.080968 Median :-0.002407 Median : 0.094164 Median : 0.005612
Mean : 0.000000 Mean : 0.000000 Mean : 0.000000 Mean : 0.000000
3rd Qu.: 0.643890 3rd Qu.: 0.676251 3rd Qu.: 0.469224 3rd Qu.: 0.533157
Max. : 3.180892 Max. : 2.712223 Max. : 3.094643 Max. : 2.643337
V38 V39 V40.V1 V41 V43.V1
0:190 0:276 Min. :-1.1986205 1:135 Min. :-3.732861
1: 92 1: 6 1st Qu.:-1.1986205 2:129 1st Qu.:-0.539950
Median : 0.1781269 3: 18 Median : 0.021224
Mean : 0.0000000 Mean : 0.000000
3rd Qu.: 0.7483943 3rd Qu.: 0.660731
Max. : 2.6340820 Max. : 2.340441
V44.V1 V51 V56 V58 V59 V60 V61
Min. :-0.7158259 1: 2 5 : 14 0:157 1:270 1:242 1:224
1st Qu.:-0.7158259 3:159 21 : 14 1: 50 2: 12 2: 40 2: 58
Median :-0.7158259 6: 14 14 : 12 2: 31
Mean : 0.0000000 7:107 1 : 11 3: 32
3rd Qu.: 0.3522318 17 : 11 4: 12
Max. : 2.4883471 30 : 11
(Other):209
V63 V65 V67 V68
1:238 1:236 1:233 1:246
2: 44 2: 46 2: 49 2: 36
3. Resampling protocol
source("Resampling.R")
4. Models
The models we are going to use are: - LDA - QDA - RDA - k-NN - Naïve Bayes - GLM - Neural Networks
# RDA and Naive Bayes on the original predictors only (the explicit formula
# excludes the FDA-derived LD columns present in train).
rda.model <- rda(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
naive.model <- naiveBayes(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
# 10x10 cross-validation; last argument T -- presumably a flag about the
# model's predict() interface; TODO confirm in Resampling.R.
cross.validation(train, train$V58, rda.model, 10, 10, T)
# RDA on all columns, i.e. including the FDA discriminant coordinates.
rda.model.fda <- rda(V58~.,data=train)
cross.validation(train, train$V58, rda.model.fda, 10, 10, T)
cross.validation.naive(train, train$V58, naive.model, 10, 10)
# Tune k for k-NN: 10x10 CV error for k = 1..20. Uses vapply with a
# preallocated numeric result instead of growing err with c() inside the
# loop (O(n^2) copies); the resulting vector is identical.
err <- vapply(seq_len(20), function(k) {
  cross.validation.knn(train, train$V58, 10, 10, k)
}, numeric(1))
plot(err, type = "l")
err
# Re-run the selected neighborhood size (k = 1).
cross.validation.knn(train, train$V58, 10, 10, 1)
# Multinomial logistic regression on all predictors, with and without the
# FDA coordinates, plus AIC-based stepwise reduction of each variant; every
# model is scored with 10x10 cross-validation.
multinomial.model <- multinom(V58~., data=train)
cross.validation(train, train$V58, multinomial.model, 10, 10, F)
multinomial.model.step <- step(multinomial.model)
cross.validation(train, train$V58, multinomial.model.step, 10, 10, F)
multinomial.model.noFDA <- multinom(V58~.-LD1-LD2-LD3-LD4, data=train)
cross.validation(train, train$V58, multinomial.model.noFDA, 10, 10, F)
multinomial.model.noFDA.step <- step(multinomial.model.noFDA)
cross.validation(train, train$V58, multinomial.model.noFDA.step, 10, 10, F)
Test error
# Refit on the full training set and estimate the generalization error on
# the held-out test set from the confusion matrix.
# NOTE(review): the update() result is stored in rda.model but the
# prediction below uses rda.model.fda -- confirm which object was intended.
rda.model <- update(rda.model.fda, data=train)
pred.test <- predict(rda.model.fda, test)
pred.test <- pred.test$class
(err.table <- table(True=test$V58, Pred=pred.test))
(err.test <- 1-sum(diag(err.table))/sum(err.table))
Hungarian data
# Same pipeline applied to the Hungarian dataset (note: no "location"
# column in the dummy list this time).
hung <- read.csv("../data/hungarian.csv", header=F)
head(hung)
# Missings
hung <- hung[,much.na.cols(hung,60)]
dummy <- c("V1", "V2", "V36", "V69", "V70", "V71", "V72", "V73", "V28")
hung <- remove.var(hung, dummy)
hung <- knn.imputation(hung, 7)
# Multicollinearity
# Detect pairs with |cor| > 0.9 (indices printed below); V57 and V55 are
# dropped as redundant in the next chunk.
corr.factors <- cor(hung)
which(abs(corr.factors)-diag(diag(corr.factors))>0.9, arr.ind=T)
row col
V55 32 12
V57 34 14
V43 31 30
V42 30 31
V20 12 32
V22 14 34
# Remove the redundant correlated columns detected above.
hung <- remove.var(hung, c("V57", "V55"))
# Factors
# Convert the categorical columns (including the target V58) to factors.
factores <- c("V58", "V4", "V9", "V16", "V19", "V20", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V38", "V39", "V56", "V11")
for (f in factores){
hung[,f] <- as.factor(hung[,f])
}
# Merge the rare level 2 of V25 into level 1.
hung <- move.value(hung, "V25", 2, 1)
summary(hung)
V3 V4 V5 V6 V7
Min. :28.00 0: 81 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:42.00 1:213 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :49.00 Median :1.0000 Median :0.0000 Median :1.0000
Mean :47.83 Mean :0.9218 Mean :0.4422 Mean :0.5204
3rd Qu.:54.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :66.00 Max. :1.0000 Max. :1.0000 Max. :1.0000
V9 V10 V11 V12 V16 V19 V20
1: 11 Min. : 17.0 0:195 Min. : 8.0 0:266 0:235 3 :48
2:106 1st Qu.:120.0 1: 99 1st Qu.:198.0 1: 28 1: 53 5 :36
3: 54 Median :130.0 Median :237.0 2: 6 4 :34
4:123 Mean :132.2 Mean :236.8 7 :29
3rd Qu.:140.0 3rd Qu.:277.0 11 :27
Max. :200.0 Max. :603.0 6 :23
(Other):97
V21 V22 V23 V24 V25 V26 V27 V29
16 : 20 83:21 0:293 0:274 0:265 0:269 0:290 Min. : 1.00
2 : 14 84:62 1: 1 1: 20 1: 29 1: 25 1: 4 1st Qu.: 7.00
3 : 13 85:84 Median :10.00
9 : 13 86:91 Mean :10.62
12 : 11 87:36 3rd Qu.:13.00
21 : 11 Max. :24.00
(Other):212
V31 V32 V33 V34
Min. : 2.000 Min. : 47.0 Min. : 27.00 Min. : 17.0
1st Qu.: 5.000 1st Qu.:122.0 1st Qu.: 70.00 1st Qu.:160.0
Median : 6.000 Median :140.0 Median : 80.00 Median :180.0
Mean : 5.699 Mean :138.8 Mean : 80.85 Mean :180.7
3rd Qu.: 7.000 3rd Qu.:155.0 3rd Qu.: 92.00 3rd Qu.:200.0
Max. :11.000 Max. :190.0 Max. :134.00 Max. :240.0
V35 V37 V38 V39 V40
Min. : 5.0 Min. : 7.00 0:204 0:291 Min. :0.0000
1st Qu.: 90.0 1st Qu.: 80.00 1: 90 1: 3 1st Qu.:0.0000
Median : 95.0 Median : 80.00 Median :0.0000
Mean : 95.4 Mean : 84.13 Mean :0.5861
3rd Qu.:100.0 3rd Qu.: 90.00 3rd Qu.:1.0000
Max. :134.0 Max. :110.00 Max. :5.0000
V42 V43 V56 V58
Min. : 3.00 Min. : 2.0 20 : 33 0:188
1st Qu.:10.00 1st Qu.: 9.0 30 : 32 1: 37
Median :13.00 Median :13.0 10 : 25 2: 26
Mean :13.69 Mean :13.3 16 : 16 3: 28
3rd Qu.:17.00 3rd Qu.:17.0 25 : 16 4: 15
Max. :31.00 Max. :30.0 15 : 15
(Other):157
# Visual inspection of the Hungarian data: numeric histograms, boxplots,
# categorical histograms (second argument F -- presumably switches to the
# factor variables; TODO confirm), then correlated pairs.
histograms(hung)



boxplot.num(hung)



histograms(hung, F)


show.cor(hung)
row col
V7 4 3
V6 3 4
V37 13 5
V31 8 7
V29 7 8
V35 12 11
V34 11 12
V37 13 12
V10 5 13
V35 12 13
V43 16 15
V42 15 16
row col







# Normalizing transformations per variable group (sqrt with a
# negative-handling variant, plain sqrt, log), then standardize the numeric
# columns; drop the near-degenerate factors V23/V39 and merge the rare
# level 2 of V19 into 1.
hung <- apply.trans(hung, sqrt.neg.vars=c("V10", "V12"), sqrt.vars = c("V31", "V29", "V42", "V43"), log.vars = c("V6", "V7", "V40"))
hung <- scale.num(hung)
hung <- remove.var(hung, c("V23", "V39"))
hung <- move.value(hung, "V19", 2, 1)
summary(hung)
V3.V1 V4 V5.V1 V6.V1
Min. :-2.5380193 0: 81 Min. :-3.426738 Min. :-0.8888120
1st Qu.:-0.7458616 1:213 1st Qu.: 0.290830 1st Qu.:-0.8888120
Median : 0.1502173 Median : 0.290830 Median :-0.8888120
Mean : 0.0000000 Mean : 0.000000 Mean : 0.0000000
3rd Qu.: 0.7902736 3rd Qu.: 0.290830 3rd Qu.: 1.1212705
Max. : 2.3264088 Max. : 0.290830 Max. : 1.1212705
V7.V1 V9 V10.V1 V11 V12.V1
Min. :-1.0399113 1: 11 Min. :-10.485789 0:195 Min. :-4.891897
1st Qu.:-1.0399113 2:106 1st Qu.: -0.525584 1: 99 1st Qu.:-0.341989
Median : 0.9583496 3: 54 Median : -0.053231 Median : 0.103229
Mean : 0.0000000 4:123 Mean : 0.000000 Mean : 0.000000
3rd Qu.: 0.9583496 3rd Qu.: 0.398645 3rd Qu.: 0.521963
Max. : 0.9583496 Max. : 2.790781 Max. : 3.159997
V16 V19 V20 V21 V22 V24 V25 V26
0:266 0:235 3 :48 16 : 20 83:21 0:274 0:265 0:269
1: 28 1: 59 5 :36 2 : 14 84:62 1: 20 1: 29 1: 25
4 :34 3 : 13 85:84
7 :29 9 : 13 86:91
11 :27 12 : 11 87:36
6 :23 21 : 11
(Other):97 (Other):212
V27 V29.V1 V31.V1 V32.V1
0:290 Min. :-2.8823019 Min. :-2.3234920 Min. :-3.801186
1: 4 1st Qu.:-0.6976042 1st Qu.:-0.2892675 1st Qu.:-0.696194
Median :-0.0119271 Median : 0.2389864 Median : 0.049004
Mean : 0.0000000 Mean : 0.0000000 Mean : 0.000000
3rd Qu.: 0.5765086 3rd Qu.: 0.7247660 3rd Qu.: 0.670002
Max. : 2.2935054 Max. : 2.3852879 Max. : 2.118998
V33.V1 V34.V1 V35.V1 V37.V1
Min. :-3.402790 Min. :-5.995364 Min. :-6.729394 Min. :-7.492772
1st Qu.:-0.685630 1st Qu.:-0.758793 1st Qu.:-0.402310 1st Qu.:-0.401452
Median :-0.053733 Median :-0.026406 Median :-0.030129 Median :-0.401452
Mean : 0.000000 Mean : 0.000000 Mean : 0.000000 Mean : 0.000000
3rd Qu.: 0.704544 3rd Qu.: 0.705982 3rd Qu.: 0.342052 3rd Qu.: 0.569962
Max. : 3.358514 Max. : 2.170757 Max. : 2.872886 Max. : 2.512790
V38 V40.V1 V42.V1 V43.V1
0:204 Min. :-0.7022515 Min. :-2.4537060 Min. :-2.8735946
1: 90 1st Qu.:-0.7022515 1st Qu.:-0.5948207 1st Qu.:-0.7585268
Median :-0.7022515 Median :-0.0186920 Median : 0.0491369
Mean : 0.0000000 Mean : 0.0000000 Mean : 0.0000000
3rd Qu.: 0.7554881 3rd Qu.: 0.6539804 3rd Qu.: 0.7394332
Max. : 3.0659508 Max. : 2.5316230 Max. : 2.5455122
V56 V58
20 : 33 0:188
30 : 32 1: 37
10 : 25 2: 26
16 : 16 3: 28
25 : 16 4: 15
15 : 15
(Other):157
# Reproducible 2/3 train, 1/3 test split after shuffling the rows.
# (Fixes the "train.lenght" typo in the local variable name.)
set.seed(2000)
n <- nrow(hung)
train.length <- round(2*n/3)
hung <- hung[sample(n),]
train <- hung[seq_len(train.length),]
test <- hung[(train.length+1):n,]
# Map the 5 target classes (factor levels 1..5) to plotting colors with a
# single vectorized lookup. This replaces two identical copies of a
# five-step scalar-replacement sequence; the resulting character vectors
# are identical to the originals.
class.colors <- c("red", "green", "blue", "yellow", "purple")
col.class <- class.colors[as.numeric(train$V58)]
col.class2 <- class.colors[as.numeric(train$V58)]
# PCA on the numeric variables; scatter of the scores colored by class.
pca <- pca.num(hung)
plot.pca(hung, col.class2, pca = pca)

# Append the principal-component scores to the data, then fit and plot FDA
# on the training set, excluding the high-cardinality factor V56 (the
# "variables are collinear" warning below comes from this fit).
hung <- extract.pca(pca, hung)
fda <- plot.fda(train, V58~.-V56, col.class)
variables are collinear

# Project train and test onto the FDA discriminant coordinates (LD1..LD4)
# and visualize the test set in the first two.
train <- extract.fda(fda, train)
test <- extract.fda(fda, test)
# NOTE(review): this palette orders yellow before blue, unlike col.class
# above (blue = class 2, yellow = class 3); legend and points agree with
# each other but not with the earlier PCA plot -- confirm intended.
plot(test$LD1, test$LD2, col=c("red","green","yellow","blue","purple")[as.numeric(test$V58)], xlab="LD1", ylab="LD2")
legend("topleft", legend=c("0","1","2","3","4"), fill=c("red","green","yellow","blue","purple"))



# RDA without and with the FDA coordinates, plus Naive Bayes, on the
# Hungarian training set.
rda.model <- rda(V58~.-LD1-LD2-LD3-LD4, data=train)
naive.model <- naiveBayes(V58~., data=train)
cross.validation(train, train$V58, rda.model, 10, 10, T)
rda.model.fda <- rda(V58~.,data=train)
# NOTE(review): 2 folds here vs 10 above; this call failed in the recorded
# session because cross.validation was not in scope (error below) --
# re-source Resampling.R before running this chunk.
cross.validation(train, train$V58, rda.model.fda, 2, 10, T)
Error in cross.validation(train, train$V58, rda.model.fda, 2, 10, T) :
no se pudo encontrar la función "cross.validation"
cross.validation.naive(train, train$V58, naive.model, 10, 10)
[1] "Fold: 1"
[1] "Fold: 2"
[1] "Fold: 3"
[1] "Fold: 4"
[1] "Fold: 5"
[1] "Fold: 6"
[1] "Fold: 7"
[1] "Fold: 8"
[1] "Fold: 9"
[1] "Fold: 10"
[1] "Iteration 1, mean error: 0.306052631578947"
[1] "Fold: 1"
[1] "Fold: 2"
[1] "Fold: 3"
[1] "Fold: 4"
[1] "Fold: 5"
[1] "Fold: 6"
[1] "Fold: 7"
[1] "Fold: 8"
[1] "Fold: 9"
[1] "Fold: 10"
[1] "Iteration 2, mean error: 0.311315789473684"
[1] "Fold: 1"
[1] "Fold: 2"
[1] "Fold: 3"
[1] "Fold: 4"
[1] "Fold: 5"
[1] "Fold: 6"
[1] "Fold: 7"
[1] "Fold: 8"
[1] "Fold: 9"
[1] "Fold: 10"
[1] "Iteration 3, mean error: 0.305263157894737"
[1] "Fold: 1"
[1] "Fold: 2"
[1] "Fold: 3"
[1] "Fold: 4"
[1] "Fold: 5"
[1] "Fold: 6"
[1] "Fold: 7"
[1] "Fold: 8"
[1] "Fold: 9"
[1] "Fold: 10"
[1] "Iteration 4, mean error: 0.325526315789474"
[1] "Fold: 1"
[1] "Fold: 2"
[1] "Fold: 3"
[1] "Fold: 4"
[1] "Fold: 5"
[1] "Fold: 6"
[1] "Fold: 7"
[1] "Fold: 8"
[1] "Fold: 9"
[1] "Fold: 10"
[1] "Iteration 5, mean error: 0.30578947368421"
[1] "Fold: 1"
[1] "Fold: 2"
[1] "Fold: 3"
[1] "Fold: 4"
[1] "Fold: 5"
[1] "Fold: 6"
[1] "Fold: 7"
[1] "Fold: 8"
[1] "Fold: 9"
[1] "Fold: 10"
[1] "Iteration 6, mean error: 0.333157894736842"
[1] "Fold: 1"
[1] "Fold: 2"
[1] "Fold: 3"
[1] "Fold: 4"
[1] "Fold: 5"
[1] "Fold: 6"
[1] "Fold: 7"
[1] "Fold: 8"
[1] "Fold: 9"
[1] "Fold: 10"
[1] "Iteration 7, mean error: 0.321052631578947"
[1] "Fold: 1"
[1] "Fold: 2"
[1] "Fold: 3"
[1] "Fold: 4"
[1] "Fold: 5"
[1] "Fold: 6"
[1] "Fold: 7"
[1] "Fold: 8"
[1] "Fold: 9"
[1] "Fold: 10"
[1] "Iteration 8, mean error: 0.326578947368421"
[1] "Fold: 1"
[1] "Fold: 2"
[1] "Fold: 3"
[1] "Fold: 4"
[1] "Fold: 5"
[1] "Fold: 6"
[1] "Fold: 7"
[1] "Fold: 8"
[1] "Fold: 9"
[1] "Fold: 10"
[1] "Iteration 9, mean error: 0.317368421052632"
[1] "Fold: 1"
[1] "Fold: 2"
[1] "Fold: 3"
[1] "Fold: 4"
[1] "Fold: 5"
[1] "Fold: 6"
[1] "Fold: 7"
[1] "Fold: 8"
[1] "Fold: 9"
[1] "Fold: 10"
[1] "Iteration 10, mean error: 0.32"
[1] 0.3172105
# Tune k for k-NN on the Hungarian training set: 10x10 CV error for
# k = 1..20. Uses vapply with a preallocated numeric result instead of
# growing err with c() inside the loop; the resulting vector is identical.
err <- vapply(seq_len(20), function(k) {
  cross.validation.knn(train, train$V58, 10, 10, k)
}, numeric(1))
plot(err, type = "l")

err
[1] 0.3358684 0.3461842 0.2990263 0.2943158 0.3140526 0.3193947 0.3229737
[8] 0.3300526 0.3405526 0.3535000 0.3626053 0.3702368 0.3702368 0.3772895
[15] 0.3814474 0.3809474 0.3839737 0.3849737 0.3844737 0.3865000
# Multinomial logistic regression with and without the FDA coordinates,
# each with AIC-based stepwise reduction, scored via 10x10 CV; finally a
# QDA model on the raw predictors.
multinomial.model <- multinom(V58~., data=train)
cross.validation(train, train$V58, multinomial.model, 10, 10, F)
multinomial.model.step <- step(multinomial.model)
cross.validation(train, train$V58, multinomial.model.step, 10, 10, F)
multinomial.model.noFDA <- multinom(V58~.-LD1-LD2-LD3-LD4, data=train)
cross.validation(train, train$V58, multinomial.model.noFDA, 10, 10, F)
multinomial.model.noFDA.step <- step(multinomial.model.noFDA)
cross.validation(train, train$V58, multinomial.model.noFDA.step, 10, 10, F)
qda.model <- qda(V58~.-LD1-LD2-LD3-LD4, data=train)
Neural networks
# Neural network on the FDA coordinates only, tuning weight decay via caret.
decays <- c(0.0001, 0.001, 0.01, 0.1, 1)
# NOTE(review): trc is used here but only defined in the next chunk -- this
# relies on a value left over in the session; define trc before this call.
nn.model10x10CV <- train(V58~LD1+LD2+LD3+LD4, data = train, method = 'nnet',
trace=F, maxit=1000,
tuneGrid = expand.grid(.size=9,.decay=decays), trControl=trc)
nn.model10x10CV$results
nn.model10x10CV$bestTune
# 5-fold CV over decay for a size-9 network on all predictors.
trc <- trainControl (method="repeatedcv", number=5, repeats=1)
decays <- c(0, 0.01, 0.1, 1)
# Fixed: was "MaxNWt=2000" -- arguments forwarded through caret's "..." to
# nnet need the exact name MaxNWts, otherwise the raised weight limit may
# silently never reach nnet.
nn.model10x10CV <- train(V58~., data = train, method = 'nnet',
trace=F, maxit=1000, MaxNWts=2000,
tuneGrid = expand.grid(.size=9,.decay=decays), trControl=trc)
nn.model10x10CV$results
nn.model10x10CV$bestTune
# NOTE(review): decays is redefined here but the grid below fixes
# .decay=0, so these values are never used -- possibly .decay=decays was
# intended.
decays <- c(1, 10, 100, 1000)
nn.model10x10CV <- train(V58~LD1+LD2+LD3+LD4, data = train, method = 'nnet',
trace=F, maxit=1000,
tuneGrid = expand.grid(.size=9,.decay=0), trControl=trc)
nn.model10x10CV$results
nn.model10x10CV$bestTune
nn <- nnet(V58~., data=train, maxit=1000, size=9, decay=1, MaxNWt=2000)
# Decode an indicator row into its class label: the position (0-based) of
# the first truthy entry among the first five elements, or 5 if none is
# set. (Defined here but not referenced elsewhere in this transcript.)
getvalue <- function(row){
  for (k in seq_len(5)) {
    if (row[k]) {
      return(k - 1)
    }
  }
  5
}
# Confusion matrices: which.max over the 5 output units gives the predicted
# class index; subtracting 1 maps back to the labels 0-4.
table(train$V58, apply(nn$fitted.values, 1, which.max)-1)
pred <- predict(nn, test)
(tab <- table(test$V58, apply(pred, 1, which.max)-1))
(err.test <- 1 - sum(diag(tab))/sum(tab))
# Tune decay for a size-20 network on the raw predictors (FDA coordinates
# and the high-cardinality factors V56/V20/V21/V22 excluded).
trc <- trainControl (method="repeatedcv", number=10, repeats=1)
decays <- c(0.67, 0.66, 0.68)
# NOTE(review): sizes is defined but unused -- the grid fixes .size=20.
sizes <- c(1, 2, 3, 4,5,6,7)
nn.model10x10CV <- train(V58~.-LD1-LD2-LD3-LD4-V56-V20-V21-V22, data = train,
method = 'nnet',
trace=F, maxit=1000, MaxNWts=10000,
tuneGrid = expand.grid(.size=20,.decay=decays),
trControl=trc)
nn.model10x10CV$results
nn.model10x10CV$bestTune
nn <- nnet(V58~.-LD1-LD2-LD3-LD4-V56-V20-V21-V22, data=train, maxit=1000, size=20, decay=0.68, MaxNWts=10000)
# weights: 685
initial value 439.074092
iter 10 value 181.772622
iter 20 value 163.301853
iter 30 value 160.316728
iter 40 value 159.762315
iter 50 value 159.510276
iter 60 value 159.406120
iter 70 value 159.392548
iter 80 value 159.385878
iter 90 value 159.382937
iter 100 value 159.382143
iter 110 value 159.381815
iter 120 value 159.381599
final value 159.381515
converged
# Training confusion matrix for the size-20 network.
(tab <- table(train$V58, apply(nn$fitted.values, 1, which.max)-1))
0 1 2 3 4
0 116 2 0 1 0
1 9 14 3 2 0
2 6 3 8 3 0
3 4 3 3 11 0
4 1 0 1 4 2
# Training error.
(err <- 1 - sum(diag(tab))/sum(tab))
[1] 0.2295918
# Test confusion matrix and error for the 5-class problem.
pred <- predict(nn, test)
(tab <- table(test$V58, apply(pred, 1, which.max)-1))
0 1 2 3
0 63 3 2 1
1 5 3 0 1
2 1 2 0 3
3 2 1 1 3
4 0 3 2 2
(err <- 1 - sum(diag(tab))/sum(tab))
[1] 0.2959184
# Collapse targets and predictions to binary (0 = no disease, 1 = any
# disease) and recompute the error.
target <- as.numeric(test$V58)-1
target[target > 1] <- 1
pred <- apply(pred, 1, which.max)-1
pred[pred>1] <- 1
(tab <- table(target, pred))
pred
target 0 1
0 63 6
1 8 21
(err <- 1 - sum(diag(tab))/sum(tab))
[1] 0.1428571
Two-class problem
# Binary reformulation: collapse the 5-level target to 0 = no disease,
# 1 = any disease, re-split with the same seed, and fit an unregularized
# size-30 network. (Fixes the "train.lenght" typo in the local variable.)
aux <- hung
aux$V58[aux$V58 != 0] <- 1
aux$V58 <- droplevels(aux$V58)
set.seed(2000)
n <- nrow(aux)
train.length <- round(2*n/3)
aux <- aux[sample(n),]
train_aux <- aux[seq_len(train.length),]
test_aux <- aux[(train.length+1):n,]
nn <- nnet(V58~.-V56-V20-V21-V22, data=train_aux, maxit=1000, size=30, decay=0, MaxNWts = 10000)
# weights: 901
initial value 184.168596
iter 10 value 39.613298
iter 20 value 0.489019
iter 30 value 0.008910
iter 40 value 0.001337
iter 50 value 0.000283
iter 60 value 0.000194
final value 0.000089
converged
# With decay 0 the network separates the training set perfectly
# (training error 0 -- overfitting; regularized below).
(tab<- table(train_aux$V58, (nn$fitted.values > 0.5)*1))
0 1
0 119 0
1 0 77
(err.train <- 1 - sum(diag(tab))/sum(tab))
[1] 0
# Tune decay for the binary size-30 network via 10-fold CV.
decays <- c(0.52, 0.55, 0.57, 0.6)
# NOTE(review): sizes is defined but unused -- the grid fixes .size=30.
sizes <- c(4,5, 6, 7)
trc <- trainControl (method="repeatedcv", number=10, repeats=1)
nn.model10x10CV <- train(V58~.-V56-V20-V21-V22, data = train_aux,
method = 'nnet',
trace=F, maxit=1000, MaxNWts=10000000,
tuneGrid = expand.grid(.size=30,.decay=decays), trControl=trc)
nn.model10x10CV$results
nn.model10x10CV$bestTune
nn <- nnet(V58~.-V56-V20-V21-V22, data=train_aux, maxit=1000, size=30, decay=0.55, MaxNWt=10000)
# weights: 901
initial value 222.113045
iter 10 value 81.643410
iter 20 value 72.298607
iter 30 value 71.251664
iter 40 value 70.796652
iter 50 value 70.483355
iter 60 value 70.415911
iter 70 value 70.401168
iter 80 value 70.388071
iter 90 value 70.383196
iter 100 value 70.382250
iter 110 value 70.382108
iter 120 value 70.382065
final value 70.382063
converged
# Training confusion matrix and error for the regularized binary network.
(tab<- table(train_aux$V58, (nn$fitted.values > 0.5)*1))
0 1
0 110 9
1 13 64
(err.train <- 1 - sum(diag(tab))/sum(tab))
[1] 0.1122449
# NOTE(review): the lines below duplicate the evaluation above verbatim.
(tab<- table(train_aux$V58, (nn$fitted.values > 0.5)*1))
0 1
0 110 9
1 13 64
(err.train <- 1 - sum(diag(tab))/sum(tab))
[1] 0.1122449
# Test evaluation of the binary network. Fixed: predictions were computed
# on `test` (the 5-class split) while the table used test_aux labels; both
# splits share the seed so rows align, but predicting on test_aux is the
# consistent and safe choice.
pred <- predict(nn, test_aux)
(tab<- table(test_aux$V58, (pred > 0.5)*1))
0 1
0 59 10
1 7 22
(err.train <- 1 - sum(diag(tab))/sum(tab))
[1] 0.1734694